#Python script to compare the near neighbor counties across demographics.

import pandas as pd
import numpy as np
import json
import gspread
import copy
import re
import os;
import statsmodels.stats.descriptivestats as smsd
from datetime import datetime
from oauth2client.service_account import ServiceAccountCredentials
import statsmodels.api as sm
from scipy import stats
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import median_test
import statsmodels.stats.descriptivestats as smsd


from statsmodels.formula.api import ols
from linearmodels import PanelOLS

basedir = os.getcwd() + '\\'
currentdir = 'Folder where data is stored CurrentData\\'
datadir = 'Folder with base data DataCreation\\'
revisedir = 'Folder you want the results put into\\'
print("Did you check the date of the file you are using to create the database with?")

#For the calculation of the ratio table see NeighborComparison2.py for the code.

#Calculate lagged growth in the fatality rate. Put into a table along the lines the referee wants.
#Need to change to first difference. Sample data in the file has been uploaded. 
rawdat = pd.read_csv(datadir + 'COVIDRegressionDataDiffW2W20210322Mini.csv',low_memory=False,dtype={'countyFIPS':str})
rawdat = rawdat.rename(columns={'date':'Date'})
rawdat.loc[:,'Date'] = pd.to_datetime(rawdat['Date'],format='%m/%d/%Y')
#rawdat.rename(columns={'lag1':'GrowthFatRatelag1','lag2':'GrowthFatRatelag2','lag3':'GrowthFatRatelag3','lag4':'GrowthFatRatelag4'},inplace=True)

########################################
#Comment out to keep all data. Leave in to require 1 fatality in a county prior to running.
#rawdat = rawdat.loc[rawdat['TotDeathsPerCap']>0,:]

#######################################

#Get the county pairs. The county pairs are in CountyHedonicNeighborsMile_100_NB_HD2.csv.
pairdat = pd.read_csv(currentdir+'CountyHedonicNeighborsMile_100_NB_HD2.csv',low_memory=False,dtype={'countyFIPS':str,'neighborFIPS':str})
pairdat = pairdat.loc[pairdat['countyFIPS'].isin(rawdat['countyFIPS']) & pairdat['neighborFIPS'].isin(rawdat['countyFIPS']),:]
rawdat = pd.merge(rawdat,pairdat,on='countyFIPS',how='right')

posFatalityFIPS = pd.DataFrame(rawdat.loc[rawdat['GrowthFatRate']>0,['countyFIPS','GrowthFatRate']].groupby('countyFIPS').min())
posFatalityFIPS.columns = ['minFatRate']

rawdat = rawdat.loc[rawdat['countyFIPS'].isin(posFatalityFIPS.index) & rawdat['neighborFIPS'].isin(posFatalityFIPS.index),:]

policies = [x for x in list(rawdat) if (x.startswith('Risk') or x.startswith('Gather') or ('Mask' in x) or x.startswith('Emergency') or\
	x.startswith('Parks') or ('Nursing' in x) or x.startswith('BusOpenRevStart') or x.startswith('StayHomeStart') or x.startswith('ElectiveStart') or\
	x.startswith('Phase') or x.startswith('RestaurantsZero') or x.startswith('BarsZero') or x.startswith('GymsZero') or x.startswith('SpasZero') or\
	x.startswith('BusClose')) and 'End' not in x and x.startswith('GatherNoMaxStart')==False]


startDate = '01/01/2200'
rawdat.loc[:,[x for x in list(rawdat) if 'Rest' in x]] = rawdat.loc[:,[x for x in list(rawdat) if 'Rest' in x]].fillna('01/01/2200')
startDate = pd.to_datetime(startDate, format='%m/%d/%Y')
for p in policies+[x for x in list(rawdat) if 'Rest' in x]:
	rawdat.loc[:,p] = pd.to_datetime(rawdat[p],format='%m/%d/%Y')

#Sample data has been uploaded in DummyDataDiffPartialTime20210329Mini.csv.
dummydat = pd.read_csv(datadir + 'DummyDataDiffPartialTime20210329Mini.csv',low_memory=False,dtype={'countyFIPS':str})
dummydat.loc[:,'Date'] = pd.to_datetime(dummydat['Date'],format='%Y-%m-%d')
dumbarrest = [x for x in list(dummydat) if 'dRestaurants' in x or 'dBars' in x or 'wRestaurants' in x or 'wBars' in x]
rawdat = pd.merge(rawdat,dummydat[['countyFIPS','Date',]+dumbarrest],on=['countyFIPS','Date'],how='left')
rawdat['dbarCrestCAll'] = rawdat.loc[:,[x for x in list(rawdat) if 'dBarsZero' in x or 'dRestaurantsZero' in x]].sum(axis='columns')
rawdat.loc[rawdat['dbarCrestCAll']==1,'dbarCrestCAll'] = 0
rawdat.loc[rawdat['dbarCrestCAll']==2,'dbarCrestCAll'] = 1
rawdat['dbarCrestOpenAll'] = rawdat.loc[:,[x for x in list(rawdat) if 'dBarsZero' in x]].sum(axis='columns')*(1-rawdat.loc[:,[x for x in list(rawdat) if 'dRestaurantsZero' in x]].max(axis='columns'))
#For each county find the maximum date in BarsZeroStart and RestaurantsZeroStart with wBarsZero and wRestaurantsZero both positive.
wBarsZero = [x for x in list(rawdat) if 'wBarsZeroStart' in x]
wRestZero = [x for x in list(rawdat) if 'wRestaurantsZeroStart' in x]
wRestOpen = [x for x in list(rawdat) if 'wRestaurantsZeroStart' not in x and x.startswith('wRestaurants')]

fipsList = sorted(list(set(rawdat['countyFIPS'])))
for i in ['1','2','3','4']:
	rawdat['barsCrestC'+i] = np.nan
	rawdat['barsCrestOpen'+i] = np.nan

for f in fipsList:
	rcounty = rawdat.loc[rawdat['countyFIPS']==f,:]
	newAllClose = rcounty['dbarCrestCAll'].diff(1)
	countyAllClose = []
	if rcounty.loc[rcounty.index.min(),'dbarCrestCAll']==1:
		countyAllClose = [rcounty.loc[rcounty.index.min(),'Date']]

	countyAllClose = countyAllClose + list(rcounty.loc[newAllClose==1,'Date'])
	for i in ['1','2','3','4']:
		if int(i)<=len(countyAllClose):
		      rawdat.loc[rawdat['countyFIPS']==f,'barsCrestC'+i] = list(countyAllClose)[int(i)-1]

	newOneOpen = rcounty['dbarCrestOpenAll'].diff(1)
	countyOneOpen = []
	if rcounty.loc[rcounty.index.min(),'dbarCrestOpenAll']==1:
	       countyOneOpen = [rcounty.loc[rcounty.index.min(),'Date']]

	countyOneOpen = countyOneOpen + list(rcounty.loc[newOneOpen==1,'Date'])
	for i in ['1','2','3','4']:
		if int(i)<=len(countyOneOpen):
		      rawdat.loc[rawdat['countyFIPS']==f,'barsCrestOpen'+i] = list(countyOneOpen)[int(i)-1]


for i in ['1','2','3','4']:
	policies = policies + ['barsCrestC'+i,'barsCrestOpen'+i]



mainpolicies = sorted(set([x.strip('0123456789') for x in policies]))
mainpolicies = [x for x in mainpolicies if x != 'RestaurantsZeroStart' and x != 'BarsZeroStart' and x != 'Phase4Start']
mainpolicies = ['barsCrestC','barsCrestOpen'] + mainpolicies 

indexPolicy = {\
'StayHomeStart':'Stay at Home', \
'EmergencyStart':'State of Emergency', \
'NursingMustAcceptStart':'Nursing Accept Pos.', \
'NursingHomeStart':'No Nursing Visits', \
'EmployeeMaskStart':'Employees Masks', \
'ResidentMaskRecStart':'Masks Recommended', \
'ResidentMaskMandStart':'Mandatory Masks',
'ParksStart':'Beaches or Parks Closed', \
'ElectiveStart':'No Elective Procedures', \
'barsCrestC':'Restaurants and Bars Closed', \
'barsCrestOpen':'Bars Closed/Rest. Open', \
'GymsZeroStart':'Gyms Closed', \
'SpasZeroStart':'Spas Closed', \
'GatherMax10Start':'Gatherings Limited<=10', \
'GatherMax100Start':'No Gatherings Limit<=100', \
'GatherMax100plusStart':'No Gatherings Limit>100', \
'BusCloseStart':'Risk Level 1 Closed', \
'Phase1Start':'Risk Level 2 Closed', \
'Phase2Start':'Risk Level 3 Closed', \
'Phase3Start':'Risk Level 4 Closed', \
'BusOpenRevStart':'Re-openings Reversed'}


indexControl = {\
'trigger':'Date of First Case (Days)',\
'White':'White',\
'Black':'Black',\
'Hispanic':'Hispanic',\
'Asian':'Asian',\
'NativeAmer':'Native American',\
'Other':'Other',\
'Age65plus':'Age65plus',\
'Age85plus':'Age85plus',\
'NurseTotPop':'Nursing Home Population',\
'PerCapIncome':'Per Capita Income',\
'Diabetes':'Diabetes',\
'Obese':'Obesity',\
'Smoke':'Smoke',\
'PopDensity':'Population Density',\
'HousingDensity':'Housing Density'}

lagall = ['lag'+str(x) for x in range(1,7)]
rawdat = rawdat[['Date','countyFIPS','neighborFIPS','GrowthFatRate']+lagall+policies+list(indexControl)[1:]]
rawdat[['GrowthFatRate']+lagall] = rawdat[['GrowthFatRate']+lagall]/10 #Divide growth rate by 10 to match Heather's data.

goodFIPS = rawdat['countyFIPS'].unique()
rawdat = rawdat.loc[(rawdat['countyFIPS'].isin(goodFIPS)) & (rawdat['neighborFIPS'].isin(goodFIPS)),:]
allfips = list(rawdat['countyFIPS'].unique())

fatbyPolicyIndex = []
for i in list(indexPolicy):
        fatbyPolicyIndex = fatbyPolicyIndex + [indexPolicy[i]]

lagfatbyPolicy = pd.DataFrame(index=fatbyPolicyIndex,columns=['mean','median','sd','neighbor_mean','neighbor_median','meanP','medianP','0.1SDP','MeanDiff/SD'])
lagfatbyPolicyDiff = lagfatbyPolicy.copy()

laglist = ['lag1','lag2','neighbor_lag1','neighbor_lag2']
purelag = [x for x in laglist if 'neighbor' not in x]
neighborlag = [x for x in laglist if 'neighbor' in x]

#Create output for lagged fatality growth t-1 as well as the change t-1 - t-2.
#lagfatbyPolicy has the former and lagfatbyPolicyDiff the latter.
for m in mainpolicies:
	print(m)
	allmain = sorted([x for x in policies if m in x])
	policyData = pd.DataFrame(columns=laglist)
	for f in allfips:
		fdat = rawdat.loc[rawdat['countyFIPS']==f,['Date','GrowthFatRate']+purelag+allmain]
		fdat_neighbor = rawdat.loc[rawdat['countyFIPS']==list(pairdat.loc[pairdat['countyFIPS']==f,'neighborFIPS'].values)[0],['Date']+purelag]
		if len(fdat)>0 and len(fdat_neighbor)>0:
			fdat.reset_index(drop=True,inplace=True)
			fdat_neighbor.reset_index(drop=True,inplace=True)
			triggerDates = []
			for a in allmain:
				if pd.isnull(fdat.loc[0,a]) == False:
					triggerDates.append(fdat.loc[(fdat['Date']-pd.to_datetime(fdat[a]))>pd.Timedelta(days=3),'Date'].min())
			for t in triggerDates:
				tcompare = max(t,fdat['Date'].min())
				newrow = pd.concat([fdat.loc[fdat['Date']==tcompare,purelag].reset_index(drop=True),fdat_neighbor.loc[fdat_neighbor['Date']==tcompare,purelag].reset_index(drop=True)],axis=1,sort=False,ignore_index=True)
				newrow[[2,3]] = newrow[[2,3]].fillna(0)        
				if newrow.isnull().values.any() == True:
					crash()
				newrow.columns = laglist
				policyData = policyData.append(newrow)
	meanLag = policyData['lag1'].mean()
	SDLag = policyData['lag1'].std()
	medianLag = policyData['lag1'].median()
	meanLagDiff = (policyData['lag1']-policyData[purelag[-1]]).mean()
	SDLagDiff = (policyData['lag1']-policyData[purelag[-1]]).std()
	medianLagDiff = (policyData['lag1']-policyData[purelag[-1]]).median()
	neighbor_meanLag = policyData['neighbor_lag1'].mean()
	neighbor_medianLag = policyData['neighbor_lag1'].median()
	neighbor_meanLagDiff = (policyData['neighbor_lag1']-policyData[neighborlag[-1]]).mean()
	neighbor_medianLagDiff = (policyData['neighbor_lag1']-policyData[neighborlag[-1]]).median()         
	meanPval = ttest_ind(policyData['lag1'],policyData['neighbor_lag1'])[1]
	medianPval = median_test(policyData['lag1'],policyData['neighbor_lag1'])[1]
	meanDiffPval = ttest_1samp((policyData['lag1']-policyData['neighbor_lag1']),0)[1]
	medianDiffPval = median_test((policyData['lag1']-policyData[purelag[-1]]),(policyData['neighbor_lag1']-policyData[neighborlag[-1]]))[1]
	meanPvalSD = ttest_1samp((policyData['lag1']-policyData['neighbor_lag1']),np.sign(meanLag-neighbor_meanLag)*SDLag/10)[1]
	meanDiffPvalSD = ttest_1samp((policyData['lag1']-policyData[purelag[-1]])-(policyData['neighbor_lag1']-policyData[neighborlag[-1]]),np.sign(meanLagDiff-neighbor_meanLagDiff)*SDLagDiff/10)[1]
	lagfatbyPolicy.loc[indexPolicy[m],'mean'] = meanLag
	lagfatbyPolicy.loc[indexPolicy[m],'median'] = medianLag
	lagfatbyPolicy.loc[indexPolicy[m],'sd'] = SDLag
	lagfatbyPolicyDiff.loc[indexPolicy[m],'mean'] = meanLagDiff
	lagfatbyPolicyDiff.loc[indexPolicy[m],'median'] = medianLagDiff
	lagfatbyPolicyDiff.loc[indexPolicy[m],'sd'] = SDLagDiff
	lagfatbyPolicy.loc[indexPolicy[m],'neighbor_mean'] = neighbor_meanLag
	lagfatbyPolicy.loc[indexPolicy[m],'neighbor_median'] = neighbor_medianLag
	lagfatbyPolicyDiff.loc[indexPolicy[m],'neighbor_mean'] = neighbor_meanLagDiff
	lagfatbyPolicyDiff.loc[indexPolicy[m],'neighbor_median'] = neighbor_medianLagDiff
	lagfatbyPolicy.loc[indexPolicy[m],'meanP'] = meanPval
	lagfatbyPolicy.loc[indexPolicy[m],'medianP'] = medianPval
	lagfatbyPolicy.loc[indexPolicy[m],'MeanDiff/SD'] = (meanLag-neighbor_meanLag)/SDLag
	lagfatbyPolicyDiff.loc[indexPolicy[m],'meanP'] = meanDiffPval
	lagfatbyPolicyDiff.loc[indexPolicy[m],'medianP'] = medianDiffPval
	lagfatbyPolicyDiff.loc[indexPolicy[m],'MeanDiff/SD'] = (meanLagDiff-neighbor_meanLagDiff)/SDLagDiff
	lagfatbyPolicy.loc[indexPolicy[m],'0.1SDP'] = 1-meanPvalSD
	lagfatbyPolicyDiff.loc[indexPolicy[m],'0.1SDP'] = 1-meanDiffPvalSD

lagfatbyPolicy.rename(indexPolicy,inplace=True)
lagfatbyPolicyDiff.rename(indexPolicy,inplace=True)
lagfatbyPolicyDiff.rename(index=lambda x: x+' Diff',inplace=True)
lagfatbyPolicy = pd.concat([lagfatbyPolicy,lagfatbyPolicyDiff],sort=False)

lagfatbyPolicy.to_csv(revisedir+'LagFatalityGrowthTable.csv',index=True,header=True,na_rep='NaN')

#Now calculate the above for time invariant policies.
#Table of variables that are time invariant.

covariateTableFixed = pd.DataFrame(index=list(indexControl),columns=['mean','median','sd','neighbor_mean','neighbor_median','meanP','medianP','0.1SDP','MeanDiff/SD'])
covariateTableFixed = covariateTableFixed.reindex(list(indexControl))


fipsdict = dict(zip(rawdat['countyFIPS'],rawdat['neighborFIPS']))

for m in list(indexControl):
	print(m)
	policyData = pd.DataFrame(index=allfips,columns=['treated','neighbor'])
	if m == 'trigger':
		for f in allfips:
			if len(rawdat.loc[(rawdat['countyFIPS']==f) & (rawdat['GrowthFatRate']>0),:])>0 and len(rawdat.loc[(rawdat['countyFIPS']==fipsdict[f]) & (rawdat['GrowthFatRate']>0),:])>0:
				policyData.loc[f,'treated'] = (rawdat.loc[(rawdat['countyFIPS']==f) & (rawdat['GrowthFatRate']>0),['Date']].min() - pd.to_datetime('2020-01-01')).dt.days[0]
				policyData.loc[f,'neighbor'] = (rawdat.loc[(rawdat['countyFIPS']==fipsdict[f]) & (rawdat['GrowthFatRate']>0),['Date']].min() - pd.to_datetime('2020-01-01')).dt.days[0]
	else:
		for f in allfips:
			policyData.loc[f,'treated'] = rawdat.loc[rawdat['countyFIPS']==f,m].median()
			policyData.loc[f,'neighbor'] = rawdat.loc[rawdat['countyFIPS']==fipsdict[f],m].median()
	#Lose some FIPS codes if they are missing a variable. For example, never had a fatality.
	policyData.dropna(inplace=True)	
	policyData = policyData.astype(float)
	meanTreat = policyData['treated'].mean()
	SDTreat = policyData['treated'].std()
	medianTreat = policyData['treated'].median()
	meanNeighbor = policyData['neighbor'].mean()
	medianNeighbor = policyData['neighbor'].median() 
	meanPval = ttest_1samp(policyData['treated']-policyData['neighbor'],0)[1]
	medianPval = median_test(policyData['treated'],policyData['neighbor'])[1]
	meanPvalSD = ttest_1samp(policyData['treated']-policyData['neighbor'],np.sign(meanTreat-meanNeighbor)*SDTreat/10)[1]
	covariateTableFixed.loc[m,'mean'] = meanTreat
	covariateTableFixed.loc[m,'median'] = medianTreat
	covariateTableFixed.loc[m,'sd'] = SDTreat
	covariateTableFixed.loc[m,'neighbor_mean'] = meanNeighbor
	covariateTableFixed.loc[m,'neighbor_median'] = medianNeighbor
	covariateTableFixed.loc[m,'meanP'] = meanPval
	covariateTableFixed.loc[m,'medianP'] = medianPval
	covariateTableFixed.loc[m,'MeanDiff/SD'] = (meanTreat-meanNeighbor)/SDTreat
	covariateTableFixed.loc[m,'0.1SDP'] = 1-meanPvalSD



covariateTableFixed.rename(indexControl,inplace=True)
covariateTableFixed.to_csv(revisedir+'CovariatePairTable.csv',index=True,header=True,na_rep='NaN')


